avocado_data <- read_csv(url('https://raw.githubusercontent.com/michael-franke/intro-data-analysis/master/data_sets/avocado.csv')) %>%
# remove currently irrelevant columns
select( -X1 , - contains("Bags"), - year, - region) %>%
# rename variables of interest for convenience
rename(
total_volume_sold = `Total Volume`,
average_price = `AveragePrice`,
small = '4046',
medium = '4225',
large = '4770',
)
avocado_data %>%
ggplot(aes(x = log(total_volume_sold), y = average_price)) +
geom_point(color = "darkgray", alpha = 0.3) +
geom_smooth(color = "firebrick", method = "lm")
# function for the negative log-likelihood of the given
# data and fixed parameter values
nll = function(y, x, beta_0, beta_1, sd) {
# negative sigma is logically impossible
if (sd <= 0) {return( Inf )}
# predicted values
yPred = beta_0 + x * beta_1
# negative log-likelihood of each data point
nll = -dnorm(y, mean=yPred, sd=sd, log = T)
# sum over all observations
sum(nll)
}
fit_lh = optim(
# initial parameter values
par = c(1.5, 0, 0.5),
# function to optimize
fn = function(par) {
with(avocado_data,
nll(average_price, log(total_volume_sold),
par[1], par[2], par[3])
)
}
)
tibble(parameter = c("intercept", "slope", "standard deviation"), value = fit_lh$par)
## # A tibble: 3 x 2
## parameter value
## <chr> <dbl>
## 1 intercept 2.57
## 2 slope -0.102
## 3 standard deviation 0.327
avocado_data %>%
ggplot(aes(x = log(total_volume_sold), y = average_price)) +
geom_point(color = "darkgray", alpha = 0.3) +
geom_smooth(color = "firebrick", method = "lm") +
geom_abline(aes(slope = fit_lh$par[2], intercept = fit_lh$par[1]))
# select data to use
price <- as_data(avocado_data$average_price)
log_sold <- as_data(log(avocado_data$total_volume_sold))
# latent variables and priors
intercept <- student(df= 1, mu = 0, sigma = 10) # df = degree of freedom
slope <- student(df= 1, mu = 0, sigma = 10)
sigma <- normal(0 , 5, truncation = c(0, Inf))
# derived latent variable (linear model)
mean <- intercept + slope * log_sold
# likelihood
price <- normal(mean, sigma)
# finalize model, register which parameters to monitor
m <- model(intercept, slope, sigma)
draws <- greta::mcmc(
model = m,
n_samples = 2000,
warmup = 1000,
chains = 4
)
# cast results (type 'mcmc.list') into tidy tibble
tidy_draws <- ggmcmc::ggs(draws)
tidy_draws
## # A tibble: 24,000 x 4
## Iteration Chain Parameter value
## <int> <int> <fct> <dbl>
## 1 1 1 intercept -0.00875
## 2 2 1 intercept -0.00883
## 3 3 1 intercept -0.00907
## 4 4 1 intercept -0.00860
## 5 5 1 intercept -0.0102
## 6 6 1 intercept -0.00985
## 7 7 1 intercept -0.00932
## 8 8 1 intercept -0.00972
## 9 9 1 intercept -0.00908
## 10 10 1 intercept -0.00870
## # … with 23,990 more rows
# obtain Bayesian point and interval estimates
Bayes_estimates <- tidy_draws %>%
group_by(Parameter) %>%
summarise(
'|95%' = HDInterval::hdi(value)[1],
mean = mean(value),
'95|%' = HDInterval::hdi(value)[2]
)
Bayes_estimates
## # A tibble: 3 x 4
## Parameter `|95%` mean `95|%`
## <fct> <dbl> <dbl> <dbl>
## 1 intercept -0.0186 -0.00285 0.0123
## 2 sigma 0.0742 0.0776 0.0816
## 3 slope -0.00121 0.000106 0.00139
We infer from the posterior estimates of the slope parameter that there is no or a marginal linear relationship between (log of) volume sold and average price (since the slope lies around zero for the point valued estimates as well as the 95% credible interval).
# select data to use
price <- as_data(avocado_data$average_price)
log_sold <- as_data(log(avocado_data$total_volume_sold))
# latent variables and improper priors
intercept <- variable()
slope <- variable()
sigma <- variable(lower = 0)
# derived latent variable (linear model)
mean <- intercept + slope * log_sold
# likelihood
price <- normal(mean, sigma)
# finalize model, register which parameters to monitor
m <- model(intercept, slope, sigma)
# greta::opt(model = m)
opt_ex2 <- readRDS('/Users/florian/Documents/Studium/Semester_3/Statistics and Data analysis/opt_ex2.rds')
opt_ex2
## $par
## $par$intercept
## [1] 2.565098
##
## $par$slope
## [1] -0.1024293
##
## $par$sigma
## [1] 0.3270453
##
##
## $value
## [1] 5497.596
##
## $iterations
## [1] 15
##
## $convergence
## [1] 0
The output values returned by the greta optimizer are the best fitting parameters for our model (intercept = 2.565, slope = -0.102, standard deviation = 0.327). We estimated these values earlier: they are the parameters which maximize the likelihood function of our gaussian distributed model.
data_KoF_cleaned <- read_csv(url('https://raw.githubusercontent.com/michael-franke/intro-data-analysis/master/data_sets/king-of-france_data_cleaned.csv'))
# k0 and N0 for condition 0
k0 <- data_KoF_cleaned %>%
filter(condition == "Condition 0") %>%
group_by(condition) %>%
filter(response == "TRUE") %>%
nrow()
k0
## [1] 7
N0 <- data_KoF_cleaned %>% filter(condition == "Condition 0") %>% nrow()
N0
## [1] 75
# k1 and N1 for condition 1
k1 <- data_KoF_cleaned %>%
filter(condition == "Condition 1") %>%
group_by(condition) %>%
filter(response == "TRUE") %>%
nrow()
k1
## [1] 3
N1 <- data_KoF_cleaned %>% filter(condition == "Condition 1") %>% nrow()
N1
## [1] 79
# declare as greta data arrays
y0 <- as_data(k0)
y1 <- as_data(k1)
# priors
theta_0 <- beta(1, 1)
theta_1 <- beta(1, 1)
# derived prameters
delta <- abs(theta_0 - theta_1)
# likelihood
distribution(y0) <- binomial(N0, theta_0)
distribution(y1) <- binomial(N1, theta_1)
# model
m <- model(theta_0, theta_1, delta)
draws <- greta::mcmc(m, warmup = 1000, n_samples = 2000)
tidy_draws <- ggmcmc::ggs(draws)
tidy_draws
## # A tibble: 24,000 x 4
## Iteration Chain Parameter value
## <int> <int> <fct> <dbl>
## 1 1 1 delta 0.0129
## 2 2 1 delta 0.00809
## 3 3 1 delta 0.0182
## 4 4 1 delta 0.00598
## 5 5 1 delta 0.00596
## 6 6 1 delta 0.00134
## 7 7 1 delta 0.0142
## 8 8 1 delta 0.0823
## 9 9 1 delta 0.134
## 10 10 1 delta 0.0992
## # … with 23,990 more rows
Bayes_estimates <- tidy_draws %>%
group_by(Parameter) %>%
summarise(
'|95%' = HDInterval::hdi(value)[1],
mean = mean(value),
'95|%' = HDInterval::hdi(value)[2]
)
Bayes_estimates
## # A tibble: 3 x 4
## Parameter `|95%` mean `95|%`
## <fct> <dbl> <dbl> <dbl>
## 1 delta 0.0000667 0.0574 0.124
## 2 theta_0 0.0390 0.103 0.170
## 3 theta_1 0.0102 0.0494 0.0958
The bayesian estimates of parameter delta suggest that one is more likely to respond “true” for false implicit presuppositions (condition 0) then for false explicit assertions (condition 1). The upper bound of the estimated difference between the biases for responding with “true” lies around 13% and is about 6% on average.